* Start from an empty session and disable output paging so the do-file runs without pausing.
clear all
set more off

* Project root: every relative path used below ("Data/...") is resolved against this folder.
cd "~/Dropbox/Indonesia Migration/" // Specify your root directory here
/*******************************************************************************
The IPUMS_download file contains all the census rounds from 1976 through 2010. The 1971 census does not have migration info.
********************************************************************************/
* Build one file per survey year (Data/Census_<year>.dta) containing
* province-of-birth and province-five-years-ago codes plus a three-way
* migration status (native / new migrant / lifetime migrant).
local survey_years 1976 1980 1985 1990 1995 2000 2005 2010
foreach t of local survey_years {
    * Read only the rows that belong to the current survey year.
    use if year == `t' using "Data/IPUMS_download.dta", clear

    * Province code 360626 is folded into 360054.
    recode geolev1 (360626 = 360054)

    * Prefix the birthplace and previous-residence codes with the country code
    * so that they can be compared directly with geolev1 below.
    generate bpl    = bplid  + country*1000
    generate origin = migid2 + country*1000

    * New migrant: migrate5 is 20 or 30. This can include people returning to
    * their home province; they are still counted as migrants.
    generate migtype = inlist(migrate5, 20, 30)

    * Lifetime ("old") migrant: lives outside the birth province but is not a new migrant.
    generate oldmig = (bpl != geolev1) & migtype == 0

    * Native: lives in the birth province and is not a new migrant. The
    * migtype == 0 condition means that anyone flagged as a new migrant is never
    * classified as native, even when currently living in the birth province.
    generate native = (bpl == geolev1) & migtype == 0

    * Collapse the three indicators into one categorical variable, giving
    * priority to native, then new migrant, then lifetime migrant.
    generate migstat = cond(native == 1, 0, cond(migtype == 1, 1, cond(oldmig == 1, 2, .)))
    label variable migstat "Migration status"
    label define MIGSTAT 0 "Native" 1 "New Migrant" 2 "Lifetime Migrant"
    label values migstat MIGSTAT

    save "Data/Census_`t'.dta", replace
}
/*******************************************************************************
Generating the percentage variables used to construct the IV
********************************************************************************/
* District-level settlement shares, one pass per survey year.
* For each year this produces two temporary files:
*   `<year>_all_5Y_weight' : respondent-level file
*   `a_<year>_pop_ict'     : one row per (district, origin province, year)
local survey_years 1976 1980 1985 1990 1995 2000 2005 2010
local provinces 360011 360012 360013 360014 360015 360016 360017 360018 ///
    360031 360032 360033 360034 360035 360051 360052 360053 360054 ///
    360061 360062 360063 360064 360071 360072 360073 360074 360081 360094

foreach t of local survey_years {
    use Data/Census_`t'.dta, clear
    keep year geolev1 geolev2 origin bpl migrate5 migtype age sex edattain perwt
    label variable geolev1 "Residence province at the time of the survey"
    label variable geolev2 "Residence district at the time of the survey"
    label variable origin "Residence province five years ago"
    label variable bpl "Province of birth"

    * Weighted resident population of each district.
    bysort geolev2 year: egen pop_cst = total(perwt)

    * per_ict_5Y: for a respondent's own origin province, the share of that
    * province's new migrants who live in the respondent's district.
    generate per_ict_5Y = .
    local inflow_vars
    foreach i of local provinces {
        * Indicator for a new migrant coming from origin province `i'.
        generate migrant_`i'_5Y = (origin == `i' & migtype == 1)
        * Weighted migrants from `i' living in the residence district ...
        bysort geolev2 year: egen pop_`i'ct_5Y = total(migrant_`i'_5Y*perwt)
        * ... and across all districts.
        bysort year: egen pop_`i't_5Y = total(migrant_`i'_5Y*perwt)
        * District share of all migrants from `i'.
        generate per_`i'c_5Y = pop_`i'ct_5Y/pop_`i't_5Y
        replace per_ict_5Y = per_`i'c_5Y if origin == `i'
        local inflow_vars `inflow_vars' pop_`i'ct_5Y
    }

    * Total new migrants in the district = sum over all origin provinces,
    * then expressed relative to the district population.
    egen total_immig_cst_5Y = rowtotal(`inflow_vars')
    bysort geolev2 year: generate per_immig_cst_5Y = total_immig_cst_5Y/pop_cst

    * Across-province move indicator.
    generate mvmt = inlist(migrate5, 20, 30)

    keep year geolev2 origin migrate5 per_ict_5Y per_immig_cst_5Y ///
        pop_cst migtype age sex edattain geolev1 mvmt perwt
    label variable migtype "New/recent across-province migrant (old migrants, natives = 0)"
    label variable per_ict_5Y "% migrants in the residence district from origin province"
    label variable per_immig_cst_5Y "Total new migrants in the residence district"
    label variable pop_cst "Residence district population"
    label variable mvmt "New/recent across-province migrant (old migrants, natives = 0)"

    * Respondent-level file for this survey year.
    tempfile `t'_all_5Y_weight
    save ``t'_all_5Y_weight'

    * Origin-destination pair-level file for this survey year.
    duplicates drop geolev2 origin year, force
    tempfile a_`t'_pop_ict
    save `a_`t'_pop_ict'
}

* Stack the yearly origin-destination pair files into a single all-years file.
use `a_1976_pop_ict', clear
forvalues t = 1980(5)2010 {
    append using `a_`t'_pop_ict'
}
tempfile appended_pop_ict_weight
save `appended_pop_ict_weight'
/*******************************************************************************
End of generating the percentage variables used to construct the IV
********************************************************************************/
/*******************************************************************************
Matching each migrant with their percentage (origin-destination) group. 
********************************************************************************/
* For each survey year:
*   1. compute the leave-one-out outflow of new migrants for every
*      (origin province, residence district) pair,
*   2. attach it to the respondent-level file,
*   3. build survey-weighted district controls,
*   4. keep one row per (origin, district, year) in `a_<year>_instrument'.
foreach t in 1976 1980 1985 1990 1995 2000 2005 2010 {
    * ---- Leave-one-out outflows (across-province movers only) ----
    use ``t'_all_5Y_weight', clear // Respondent-level survey year file
    keep if mvmt == 1
    bysort year origin: egen outflow_it = total(mvmt*perwt) // Outflow from the origin province
    bysort year origin geolev2: egen settle_ict = total(mvmt*perwt) // Part of that outflow settling in each residence district
    * Outflow excluding the residence district itself. To use the raw outflow
    * instead, replace the right-hand side with outflow_it.
    generate outflow_LO = outflow_it - settle_ict
    duplicates drop origin year geolev2, force
    keep origin year geolev2 outflow_LO
    tempfile migrants
    save `migrants', replace

    * ---- Back to all respondents; attach the outflow of their (origin, district) pair ----
    use ``t'_all_5Y_weight', clear
    merge m:1 origin year geolev2 using `migrants', gen(inst0)

    /***************************************************************************
    Constructing control variables at the level of present residence
    ***************************************************************************/
    generate high_educ = inrange(edattain, 3, 4)
    generate male = 1 if sex == 1
    replace male = 0 if inrange(sex, 2, 9)

    * Survey-weighted district means: sum(x*perwt) / sum(perwt).
    * The earlier form, egen mean(x*perwt), averaged the weighted products and
    * therefore scaled every control by the mean person weight instead of
    * returning a share or an average. The denominator only counts respondents
    * whose x is non-missing.
    * NOTE(review): special codes in age / edattain (e.g. "unknown") are not
    * filtered out here -- confirm against the IPUMS codebook.
    local ctrl_src high_educ male age
    local ctrl_out av_high_educ_ct prop_male_ct av_age_ct
    forvalues k = 1/3 {
        local x : word `k' of `ctrl_src'
        local y : word `k' of `ctrl_out'
        tempvar num den
        bysort geolev2 year: egen double `num' = total(`x'*perwt)
        bysort geolev2 year: egen double `den' = total(perwt * !missing(`x'))
        generate `y' = `num'/`den'
        drop `num' `den'
    }

    duplicates drop origin geolev2 year, force
    label variable av_high_educ_ct "% district pop with secondary or tertiary education"
    label variable prop_male_ct "% district male population"
    label variable av_age_ct "Average age of the district's population"
    tempfile a_`t'_instrument
    save `a_`t'_instrument'
}
/*******************************************************************************
Constructing the IV variables
********************************************************************************/
* Assemble the district-level shift-share instruments:
*   predicted inflow = sum over origins of (past settlement share * present
*   leave-one-out outflow), divided by the district population.
use `a_1976_instrument', clear
foreach t in 1980 1985 1990 1995 2000 2005 2010 {
    append using `a_`t'_instrument'
}
merge 1:1 origin geolev2 year using `appended_pop_ict_weight', gen(inst)

* Treat the 1976 round as 1975 so that all rounds are exactly 5 years apart.
replace year = 1975 if year == 1976

*5-year lag IV variable
* Take the share observed in the PREVIOUS round (per_ict_5Y[_n-1]). The earlier
* code assigned the current-round share whenever a previous round existed, so
* the "lag" was in fact contemporaneous. Sorting on (year) inside bysort
* guarantees the within-pair ordering that the subscripts rely on.
bysort geolev2 origin (year): generate per_mig_ict_lag_5Y = per_ict_5Y[_n-1] if year == year[_n-1] + 5
generate num_indiv_5Y_5Y = per_mig_ict_lag_5Y * outflow_LO // Prediction from one origin = past alpha * present outflow from the province
bysort geolev2 year: egen pred_total_immig_cst_5Y_5Y = total(num_indiv_5Y_5Y) // summed across all origins
generate pred_per_immig_cst_5Y_5Y = pred_total_immig_cst_5Y_5Y/pop_cst
replace pred_per_immig_cst_5Y_5Y = . if year == 1975 // no earlier round to take shares from

*10-year lag IV variable
* Same construction with the share from two rounds earlier.
bysort geolev2 origin (year): generate per_mig_ict_lag2_5Y = per_ict_5Y[_n-2] if year == year[_n-2] + 10
generate num_indiv_10Y_5Y = per_mig_ict_lag2_5Y * outflow_LO
bysort geolev2 year: egen pred_total_immig_cst_10Y_5Y = total(num_indiv_10Y_5Y)
generate pred_per_immig_cst_10Y_5Y = pred_total_immig_cst_10Y_5Y/pop_cst
replace pred_per_immig_cst_10Y_5Y = . if inlist(year, 1975, 1980)

* Collapse to one row per district-year and declare the panel.
duplicates drop geolev2 year, force // District-level migrant inflow file
xtset geolev2 year, delta(5)

generate per_immig_cst_5Y_lag = l.per_immig_cst_5Y
generate pred_per_immig_cst_Jaeger_10Y = l.pred_per_immig_cst_5Y_5Y

* The dollar signs are escaped (\$) so that Stata does not expand $t as a
* global macro and strip it from the LaTeX-style labels.
label variable per_immig_cst_5Y "Migrants (%) between \$t\$ and \$t-5\$"
label variable per_immig_cst_5Y_lag "Migrants (%) between \$t-5\$ and \$t-10\$"
label variable pred_per_immig_cst_5Y_5Y "Predicted migrants (%) between \$t\$ and \$t-5\$ based on migrants settlement 5 years ago"
label variable pred_per_immig_cst_10Y_5Y "Predicted migrants (%) between \$t\$ and \$t-5\$ based on migrants settlement 10 years ago"
label variable pred_per_immig_cst_Jaeger_10Y "Predicted migrants (%) between \$t-5\$ and \$t-10\$ based on migrants settlement 5 years ago"

save "Data/Instrumental variable", replace


* (A stray double-quote character was removed from this line: it is not a valid Stata command and stopped the do-file before the province-level section.)
/*******************************************************************************
********************************************************************************/
/*******************************************************************************
					CONSTRUCTING PROVINCE-LEVEL IV
********************************************************************************/
/*******************************************************************************
********************************************************************************/
/*******************************************************************************
Generating the percentage variables used to construct the IV
********************************************************************************/
* Province-level settlement shares, one pass per survey year.
* For each year this produces two temporary files:
*   `<year>_all_5Y_weight' : respondent-level file
*   `a_<year>_pop_ict'     : one row per (residence province, origin province, year)
local survey_years 1976 1980 1985 1990 1995 2000 2005 2010
local provinces 360011 360012 360013 360014 360015 360016 360017 360018 ///
    360031 360032 360033 360034 360035 360051 360052 360053 360054 ///
    360061 360062 360063 360064 360071 360072 360073 360074 360081 360094

foreach t of local survey_years {
    use Data/Census_`t'.dta, clear
    keep year geolev1 origin bpl migrate5 migtype age sex edattain perwt
    label variable geolev1 "Residence province at the time of the survey"
    label variable origin "Residence province five years ago"
    label variable bpl "Province of birth"

    * Weighted resident population of each province.
    bysort geolev1 year: egen pop_cst = total(perwt)

    * per_ict_5Y: for a respondent's own origin province, the share of that
    * province's new migrants who live in the respondent's residence province.
    generate per_ict_5Y = .
    local inflow_vars
    foreach i of local provinces {
        * Indicator for a new migrant coming from origin province `i'.
        generate migrant_`i'_5Y = (origin == `i' & migtype == 1)
        * Weighted migrants from `i' living in the residence province ...
        bysort geolev1 year: egen pop_`i'ct_5Y = total(migrant_`i'_5Y*perwt)
        * ... and across all provinces.
        bysort year: egen pop_`i't_5Y = total(migrant_`i'_5Y*perwt)
        * Residence-province share of all migrants from `i'.
        generate per_`i'c_5Y = pop_`i'ct_5Y/pop_`i't_5Y
        replace per_ict_5Y = per_`i'c_5Y if origin == `i'
        local inflow_vars `inflow_vars' pop_`i'ct_5Y
    }

    * Total new migrants in the province = sum over all origin provinces,
    * then expressed relative to the province population.
    egen total_immig_cst_5Y = rowtotal(`inflow_vars')
    bysort geolev1 year: generate per_immig_cst_5Y = total_immig_cst_5Y/pop_cst

    * Across-province move indicator.
    generate mvmt = inlist(migrate5, 20, 30)

    keep year geolev1 origin migrate5 per_ict_5Y per_immig_cst_5Y ///
        pop_cst migtype age sex edattain mvmt perwt
    label variable migtype "New/recent across-province migrant (old migrants, natives = 0)"
    label variable per_ict_5Y "% migrants in the residence province from origin province"
    label variable per_immig_cst_5Y "Total new migrants in the residence province"
    label variable pop_cst "Residence province population"
    label variable mvmt "New/recent across-province migrant (old migrants, natives = 0)"

    * Respondent-level file for this survey year.
    tempfile `t'_all_5Y_weight
    save ``t'_all_5Y_weight'

    * Origin-destination pair-level file for this survey year.
    duplicates drop geolev1 origin year, force
    tempfile a_`t'_pop_ict
    save `a_`t'_pop_ict'
}

* Stack the yearly province-level origin-destination files into a single all-years file.
use `a_1976_pop_ict', clear
forvalues t = 1980(5)2010 {
    append using `a_`t'_pop_ict'
}
tempfile appended_pop_ict_weight
save `appended_pop_ict_weight'
/*******************************************************************************
End of generating the percentage variables used to construct the IV
********************************************************************************/
/*******************************************************************************
Matching each migrant with their percentage (origin-destination) group. 
********************************************************************************/
* For each survey year:
*   1. compute the leave-one-out outflow of new migrants for every
*      (origin province, residence province) pair,
*   2. attach it to the respondent-level file,
*   3. build survey-weighted province controls,
*   4. keep one row per (origin, residence province, year) in `a_<year>_instrument'.
foreach t in 1976 1980 1985 1990 1995 2000 2005 2010 {
    * ---- Leave-one-out outflows (across-province movers only) ----
    use ``t'_all_5Y_weight', clear // Respondent-level survey year file
    keep if mvmt == 1
    bysort year origin: egen outflow_it = total(mvmt*perwt) // Outflow from the origin province
    bysort year origin geolev1: egen settle_ict = total(mvmt*perwt) // Part of that outflow settling in each residence province
    * Outflow excluding the residence province itself. To use the raw outflow
    * instead, replace the right-hand side with outflow_it.
    generate outflow_LO = outflow_it - settle_ict
    duplicates drop origin year geolev1, force
    keep origin year geolev1 outflow_LO
    tempfile migrants
    save `migrants', replace

    * ---- Back to all respondents; attach the outflow of their (origin, province) pair ----
    use ``t'_all_5Y_weight', clear
    merge m:1 origin year geolev1 using `migrants', gen(inst0)

    /***************************************************************************
    Constructing control variables at the level of present residence
    ***************************************************************************/
    generate high_educ = inrange(edattain, 3, 4)
    generate male = 1 if sex == 1
    replace male = 0 if inrange(sex, 2, 9)

    * Survey-weighted province means: sum(x*perwt) / sum(perwt).
    * The earlier form, egen mean(x*perwt), averaged the weighted products and
    * therefore scaled every control by the mean person weight instead of
    * returning a share or an average. The denominator only counts respondents
    * whose x is non-missing.
    * NOTE(review): special codes in age / edattain (e.g. "unknown") are not
    * filtered out here -- confirm against the IPUMS codebook.
    local ctrl_src high_educ male age
    local ctrl_out av_high_educ_ct prop_male_ct av_age_ct
    forvalues k = 1/3 {
        local x : word `k' of `ctrl_src'
        local y : word `k' of `ctrl_out'
        tempvar num den
        bysort geolev1 year: egen double `num' = total(`x'*perwt)
        bysort geolev1 year: egen double `den' = total(perwt * !missing(`x'))
        generate `y' = `num'/`den'
        drop `num' `den'
    }

    duplicates drop origin geolev1 year, force
    label variable av_high_educ_ct "% province pop with secondary or tertiary education"
    label variable prop_male_ct "% province male population"
    label variable av_age_ct "Average age of the province's population"
    tempfile a_`t'_instrument
    save `a_`t'_instrument'
}
/*******************************************************************************
Constructing the IV variables
********************************************************************************/
* Assemble the province-level shift-share instruments:
*   predicted inflow = sum over origins of (past settlement share * present
*   leave-one-out outflow), divided by the province population.
use `a_1976_instrument', clear
foreach t in 1980 1985 1990 1995 2000 2005 2010 {
    append using `a_`t'_instrument'
}
merge 1:1 origin geolev1 year using `appended_pop_ict_weight', gen(inst)

* Treat the 1976 round as 1975 so that all rounds are exactly 5 years apart.
replace year = 1975 if year == 1976

*5-year lag IV variable
* Take the share observed in the PREVIOUS round (per_ict_5Y[_n-1]). The earlier
* code assigned the current-round share whenever a previous round existed, so
* the "lag" was in fact contemporaneous. Sorting on (year) inside bysort
* guarantees the within-pair ordering that the subscripts rely on.
bysort geolev1 origin (year): generate per_mig_ict_lag_5Y = per_ict_5Y[_n-1] if year == year[_n-1] + 5
generate num_indiv_5Y_5Y = per_mig_ict_lag_5Y * outflow_LO // Prediction from one origin = past alpha * present outflow from the province
bysort geolev1 year: egen pred_total_immig_cst_5Y_5Y = total(num_indiv_5Y_5Y) // summed across all origins
generate pred_per_immig_cst_5Y_5Y = pred_total_immig_cst_5Y_5Y/pop_cst
replace pred_per_immig_cst_5Y_5Y = . if year == 1975 // no earlier round to take shares from

*10-year lag IV variable
* Same construction with the share from two rounds earlier.
bysort geolev1 origin (year): generate per_mig_ict_lag2_5Y = per_ict_5Y[_n-2] if year == year[_n-2] + 10
generate num_indiv_10Y_5Y = per_mig_ict_lag2_5Y * outflow_LO
bysort geolev1 year: egen pred_total_immig_cst_10Y_5Y = total(num_indiv_10Y_5Y)
generate pred_per_immig_cst_10Y_5Y = pred_total_immig_cst_10Y_5Y/pop_cst
replace pred_per_immig_cst_10Y_5Y = . if inlist(year, 1975, 1980)

* Collapse to one row per province-year and declare the panel.
duplicates drop geolev1 year, force // Province-level migrant inflow file
xtset geolev1 year, delta(5)

generate per_immig_cst_5Y_lag = l.per_immig_cst_5Y
generate pred_per_immig_cst_Jaeger_10Y = l.pred_per_immig_cst_5Y_5Y

* The dollar signs are escaped (\$) so that Stata does not expand $t as a
* global macro and strip it from the LaTeX-style labels.
label variable per_immig_cst_5Y "Migrants (%) between \$t\$ and \$t-5\$"
label variable per_immig_cst_5Y_lag "Migrants (%) between \$t-5\$ and \$t-10\$"
label variable pred_per_immig_cst_5Y_5Y "Predicted migrants (%) between \$t\$ and \$t-5\$ based on migrants settlement 5 years ago"
label variable pred_per_immig_cst_10Y_5Y "Predicted migrants (%) between \$t\$ and \$t-5\$ based on migrants settlement 10 years ago"
label variable pred_per_immig_cst_Jaeger_10Y "Predicted migrants (%) between \$t-5\$ and \$t-10\$ based on migrants settlement 5 years ago"

save "Data/Instrumental variable province level", replace





